** Reset the session before running: empty the data in memory, turn off
** output paging, clear previously defined macros, and close any open log.
clear
set more off
** The keyword for clearing macros is _all; plain "all" would only target a
** macro that happens to be named all, leaving stale globals in place.
macro drop _all
** capture keeps the script going when no log is currently open.
capture log close

/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection
Clean the Allegations Data

Created on: 11/28/17

Last Modified on: 2/20/2024

Description: This file cleans the raw allegations data.

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

** Setting the Directory
** Each global below is a path prefix that this file concatenates directly with
** a file name (for example, the rawdata prefix immediately followed by
** allegations.dta), so any path filled in here must end with a path separator.
**   rawdata   - prefix used when loading the raw allegations file
**   cleandata - prefix used when saving and reloading the cleaned files
**   tmp       - defined here but not referenced in this file
global rawdata 
global cleandata 
global tmp 

/********************************************************************************

The raw allegations data should be unique at the case*victim*allegation*perpetrator
level, although there are some duplicates.  Eventually, I'll want to use a cleaned
version of this data to construct a child*case level dataset, so I first need to make
the raw data more manageable to work with by creating a file which is unique at
the case*alleg*child*perp level.

There are 3 sections of this do file:
1) Clean Variables
2) Collapse to Child*Case*Allegation*Perp level
3) Collapse to child*case level 

*******************************************************************************/


*************************
**1) CLEAN VARIABLES
*************************

**Load raw allegations data
use "${rawdata}allegations.dta", clear

**Give the raw variables short working names in a single grouped rename
**(the i-th old name maps to the i-th new name)
rename (intakechildvicpartyid investigation_caseid srcinvestcase ///
	ongoing_caseid allegationtypedesc findingdesc ///
	intakeperppartyid relationtype relationtype2) ///
	(vicid inv_caseid src_caseid ///
	ong_caseid alleg prepond ///
	perpid relation relation2)

**Attach variable labels to the renamed variables
label variable vicid      "Child Victim ID"
label variable inv_caseid "Investigation Case ID"
label variable src_caseid "SRC Case ID"
label variable ong_caseid "Ongoing Case ID"
label variable alleg      "Allegation"
label variable prepond    "Preponderence"
label variable perpid     "Perpetrator ID"
label variable relation   "Relationship between Victim and Perpetrator"
label variable relation2  "Alt Relationship between Victim and Perpetrator"

**Clean the relation variable

*****relation and relation2 do not follow a fixed order: for one record relation
*****may hold the perpetrator's role (e.g. mother) and relation2 the child's
*****(e.g. son), and for another record the two are swapped. Only the
*****perpetrator's role is wanted, so it is stored in a new variable that is
*****filled in by checking both fields.
generate perp_relation = ""
label variable perp_relation "Relationship of Perpetrator to Child Victim"

**Mother (assigned whenever either field holds a mother role)
replace perp_relation = "Mother" ///
	if inlist(relation, "Adoptive Mother", "Biological Mother", "Legal Mother", "Stepmother") ///
	| inlist(relation2, "Adoptive Mother", "Biological Mother", "Legal Mother", "Stepmother")

**Father (assigned whenever either field holds a father role; this runs after
**the mother step and is not restricted to unclassified rows)
replace perp_relation = "Father" ///
	if inlist(relation, "Adoptive Father", "Biological Father", "Legal Father", "Putative Father", "Stepfather") ///
	| inlist(relation2, "Adoptive Father", "Biological Father", "Legal Father", "Putative Father", "Stepfather")

**Parent (unknown if it's mother or father): only the child's role is recorded
replace perp_relation = "Parent (unknown M or F)" if perp_relation == "" ///
	& (inlist(relation, "Adoptive Daughter", "Adoptive Son", "Biological Son", "Biological Daughter") ///
	| inlist(relation2, "Adoptive Daughter", "Adoptive Son", "Biological Son", "Biological Daughter"))

**Not related (the value list is split in two because inlist() accepts at most
**10 arguments when they are strings)
replace perp_relation = "Not Related" if perp_relation == "" ///
	& (inlist(relation, "Foster Brother", "Foster Daughter", "Foster Father", "Foster Mother", ///
		"Foster Sister", "Foster Son", "Living Together Partner") ///
	| inlist(relation, "Not Related", "Significant Other -out of home", "Unmarried Couple", ///
		"Guardian", "Child (guardianship)", "Former Significant Other") ///
	| inlist(relation2, "Foster Brother", "Foster Daughter", "Foster Father", "Foster Mother", ///
		"Foster Sister", "Foster Son", "Living Together Partner") ///
	| inlist(relation2, "Not Related", "Significant Other -out of home", "Unmarried Couple", ///
		"Guardian", "Child (guardianship)", "Former Significant Other"))

**Related: any still-unclassified row where at least one field carries a real
**relationship value (i.e. not blank and not one of the placeholder codes)
replace perp_relation = "Relative" if perp_relation == "" ///
	& (!inlist(relation, "", "Mapping Default", "NO MATCH FOUND", "Unknown") ///
	| !inlist(relation2, "", "Mapping Default", "NO MATCH FOUND", "Unknown"))

drop relation relation2

**Drop observations which contain no information (there are about 750,000 of these
**for some reason).
drop if perpid == 0 & alleg == "" & prepond == "" & perp_relation == ""

*************************
**2) COLLAPSE TO CHILD*CASE*ALLEG*PERP LEVEL
*************************

**Report how many observations share the same child*case*allegation*perpetrator key
duplicates report vicid inv_caseid alleg perpid

/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      4771051             0
        2 |         1696           848
        3 |            3             2
--------------------------------------
*/

**dups = number of additional copies of each key (0 when the key is unique)
duplicates tag vicid inv_caseid alleg perpid, gen(dups)

**The main reason for duplicates here appears to be a conflict in whether there
**was preponderance. Let's assume that the hierarchy is No Evidence, No Preponderance
**and Preponderance, and always keep the value that is highest in the assumed hierarchy.
**Rank the finding: 0 = anything else (treated as No Evidence), 1 = No Preponderance,
**2 = Preponderance.
gen prepond_alt=0
replace prepond_alt=1 if prepond=="No Preponderance"
replace prepond_alt=2 if prepond=="Preponderance"
**Highest rank within each duplicated key (left missing for unique keys)
bysort vicid inv_caseid alleg perpid: egen prepond_alt_max=max(prepond_alt) if dups>0
**Overwrite every copy of a duplicated key with the highest-ranked finding so the
**copies agree on prepond
replace prepond="No Evidence" if prepond_alt_max==0 & dups>0
replace prepond="No Preponderance" if prepond_alt_max==1 & dups>0
replace prepond="Preponderance" if prepond_alt_max==2 & dups>0
drop prepond_alt* dups
**duplicates drop only removes rows that are identical on EVERY variable, so copies
**that still differ on some other variable would survive this step
duplicates drop

duplicates report vicid inv_caseid alleg perpid
/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      4771900             0
--------------------------------------
*/

**Stop with an error, rather than relying on reading the report above, if the key
**is still not unique. missok allows blank/missing key values (alleg can be blank).
isid vicid inv_caseid alleg perpid, missok

**Save clean allegations data
compress
sort vicid inv_caseid alleg perpid
order vicid inv_caseid alleg perpid prepond perp_relation
save "${cleandata}allegations_clean.dta", replace

*************************
**3) COLLAPSE TO CHILD*CASE LEVEL
*************************

**Reload the allegation-level file saved at the end of section 2
use "${cleandata}allegations_clean.dta", clear

**Allegations Information
***There are 26 different allegation categories, many of which account for fewer
***than 1% of all allegations. Any category that makes up at least 1% of all
***allegations keeps its own code and the rest are coded as "other." Importantly,
***all incidents involving sexual abuse are recorded as such, because these
***incidents are to be thrown out of the analysis: more experienced caseworkers
***are automatically assigned to the most sensitive cases.
***The tabulation includes blank allegation values.
tabulate alleg, missing
/*

                             Allegation |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
                                        |        393        0.01        0.01
                            Abandonment |     22,066        0.46        0.47
                            Birth Match |      4,101        0.09        0.56
                            Child Death |        475        0.01        0.57
           Dependency - Sex Trafficking |         23        0.00        0.57
                      Domestic Violence |    179,569        3.76        4.33
                    Drug Exposed Infant |     22,010        0.46        4.79
       Drug Residence - Methamphetamine |     54,701        1.15        5.94
Drug Residence - Other Than Methamphe.. |     25,826        0.54        6.48
                     Failure To Protect |    386,587        8.10       14.58
                   Improper Supervision |  1,166,108       24.44       39.02
        Intra-Familial Sibling Violence |     15,751        0.33       39.35
                           Maltreatment |    328,330        6.88       46.23
                    Medical Child Abuse |        658        0.01       46.24
                        Medical Neglect |     89,133        1.87       48.11
                          Mental Injury |     46,939        0.98       49.09
                         Physical Abuse |    496,994       10.42       59.51
     Physical Abuse - Labor Trafficking |         15        0.00       59.51
                       Physical Neglect |    850,679       17.83       77.34
                 Severe Physical Injury |      7,612        0.16       77.49
            Sex Abuse - Sex Trafficking |         94        0.00       77.50
                           Sexual Abuse |     95,480        2.00       79.50
                         Sexual Contact |     25,078        0.53       80.02
                    Sexual Exploitation |      5,291        0.11       80.13
                     Sexual Penetration |     10,480        0.22       80.35
                   Shaken Baby Syndrome |        768        0.02       80.37
                        Substance Abuse |    280,332        5.87       86.24
                        Threatened Harm |    656,407       13.76      100.00
----------------------------------------+-----------------------------------
                                  Total |  4,771,900      100.00
*/

**Blank out the rare allegation categories (each below 1% of allegations); blanks
**are recoded to "Other" at the end of this step.
**"Dependency - Sex Trafficking" and "Sex Abuse - Sex Trafficking" are deliberately
**NOT in this list. Blanking them here would turn them into "Other" before the
**"Sex" pattern below can classify them, so those incidents would not be flagged
**as sexual abuse and would escape the sexual-abuse exclusion.
**NOTE(review): confirm that "Dependency - Sex Trafficking" should count as
**sexual abuse for the exclusion.
foreach x in "Abandonment" "Birth Match" "Child Death" ///
	"Drug Exposed Infant" "Intra-Familial Sibling Violence" "Medical Child Abuse" "Mental Injury" ///
	"Physical Abuse - Labor Trafficking" "Severe Physical Injury" ///
	"Shaken Baby Syndrome" {
		replace alleg="" if alleg=="`x'"
}

**Pool the remaining drug categories ("Drug Exposed Infant" was blanked above, so
**only the drug-residence categories are left to match)
replace alleg="Drug Residence" if regexm(alleg, "Drug")
**Pool every remaining category whose name contains "Sex", including the two
**sex-trafficking categories, into Sexual Abuse
replace alleg="Sexual Abuse" if regexm(alleg, "Sex")

**Everything blanked above, plus allegations that were blank to begin with
replace alleg="Other" if alleg==""

**One 0/1 indicator per allegation category, so that the later collapse can
**summarise them
generate domvi       = (alleg == "Domestic Violence")
generate drugres     = (alleg == "Drug Residence")
generate failprot    = (alleg == "Failure To Protect")
generate impsup      = (alleg == "Improper Supervision")
generate maltreat    = (alleg == "Maltreatment")
generate medneg      = (alleg == "Medical Neglect")
generate alleg_oth   = (alleg == "Other")
generate phyab       = (alleg == "Physical Abuse")
generate phyneg      = (alleg == "Physical Neglect")
generate sexab       = (alleg == "Sexual Abuse")
generate subab       = (alleg == "Substance Abuse")
generate threat_harm = (alleg == "Threatened Harm")

**One 0/1 indicator per perpetrator-relation category
generate mom         = (perp_relation == "Mother")
generate dad         = (perp_relation == "Father")
generate notrel      = (perp_relation == "Not Related")
generate parent_unkn = (perp_relation == "Parent (unknown M or F)")
generate rel         = (perp_relation == "Relative")

**Preponderance indicator: 1 for a "Preponderance" finding, 0 for any other
**non-blank finding, and missing when the finding is blank
generate preponderance = (prepond == "Preponderance") if prepond != ""

**Drop the variables that will not be part of the reshaped wide data
drop alleg perpid prepond perp_relation

**Build allegation x perpetrator-relation x preponderance interaction variables.
**Rows that became identical once the detailed string variables were dropped are
**removed first.
duplicates drop
global alleg domvi drugres failprot impsup maltreat medneg alleg_oth phyab phyneg sexab subab threat_harm
global rel mom dad notrel parent_unkn rel
**For each allegation/relation pair, the new variable carries the preponderance
**value (0, 1 or missing) on rows where both indicators equal 1, and is missing
**on all other rows.
foreach a of global alleg {
	foreach r of global rel {
		generate `a'_`r'_prep = preponderance if `a' == 1 & `r' == 1
	}
}

**Collapse to child*case level
**ds with the not option lists every variable except the two key variables
ds vicid inv_caseid, not
**Copy the list by macro substitution rather than evaluating it with "=": the
**list is long (dozens of names), and macro substitution avoids treating it as
**a string expression, which is length-limited in older Stata releases.
local collapse_vars `r(varlist)'
**Keep the maximum of every remaining variable within each child*case, so each
**0/1 indicator equals 1 if it was 1 on any row of that child*case.
**NOTE(review): (max) requires numeric variables - confirm that src_caseid,
**ong_caseid and any other leftover raw variables are numeric.
collapse (max) `collapse_vars', by(vicid inv_caseid)

sort vicid inv_caseid
compress
save "${cleandata}alleg_child_case_level.dta", replace





































